Data import

library (data.table)
library(ggbiplot)
# Load the two serialized input tables: the odds details and the matches.
dt_odds <- readRDS(
  "~/Documents/Dersler/Fall-2018/IE-582/df9b1196-e3cf-4cc7-9159-f236fe738215_odd_details.rds"
)
dt_matches <- readRDS(
  "~/Documents/Dersler/Fall-2018/IE-582/df9b1196-e3cf-4cc7-9159-f236fe738215_matches.rds"
)

Data manipulation

# Keep only the five bookmakers used in the analysis.
dt_odds <- dt_odds[
  bookmaker %in% c("10Bet", "bet365", "Betsafe", "bwin", "bet-at-home")
]

# Final odds: within each (match, bet type, odd type, bookmaker) group keep the
# row with the largest `date`. ties.method = "first" guarantees that exactly one
# row per group gets rank 1; with "max", rows tied on the latest date all get a
# rank > 1 and the whole group would silently disappear from the result.
# `rnk` is added to dt_odds by reference, as before.
# NOTE(review): the grouping ignores any handicap/line column the odds table may
# carry for "ou" and "ha" bets - confirm that mixing lines is intended.
dt_oddsf <- dt_odds[, rnk := rank(-date, ties.method = "first"),
                    by = list(matchId, betType, oddtype, bookmaker)][rnk == 1]

# Restrict to the four bet types of interest and the columns needed below.
dt_oddss <- dt_oddsf[
  , .(matchId, bookmaker, betType, oddtype, odd)
][betType %in% c("1x2", "bts", "ou", "ha")]

# Long -> wide: one row per (match, bookmaker), one column per betType_oddtype.
# value.var is given explicitly so dcast does not have to guess the value column.
dt_oddst <- dcast(dt_oddss, matchId + bookmaker ~ betType + oddtype,
                  value.var = "odd")

# Readable names for the 1x2 odds, then drop rows with any missing odd
# (there are enough complete instances left).
setnames(dt_oddst, c("1x2_odd1", "1x2_odd2", "1x2_oddX"), c("Home", "Away", "Tie"))
dt_oddst <- na.omit(dt_oddst)
# Split the "home:away" score string into two columns and make them integers
# so they can be used in arithmetic and comparisons.
dt_matches[, c("score1", "score2") := tstrsplit(score, ":", fixed = TRUE)]
dt_matches[, `:=`(score1 = as.integer(score1), score2 = as.integer(score2))]

# Drop every row that has a missing value in any column.
dt_matches <- na.omit(dt_matches)

# Derived labels: total goals, Over/Under relative to 2.5 goals, and the
# match outcome from the home side's point of view.
dt_matches[, totalgoal := score1 + score2]
dt_matches[, Overstatus := ifelse(totalgoal > 2.5, "Over", "Under")]
dt_matches[, Outcomes := ifelse(
  score1 > score2,
  "Home",
  ifelse(score1 < score2, "Away", "Tie")
)]
# Key both tables on matchId; merge() below joins on the shared key.
setkey(dt_oddst, matchId)
setkey(dt_matches, matchId)

# Left join of the wide odds table with the de-duplicated matches table.
# The join is the same for both tasks, so it is computed once and projected twice.
odds_with_results <- merge(dt_oddst, unique(dt_matches), all.x = TRUE)

# Task 1 table: odds plus the Over/Under label; incomplete rows are removed.
dt_merged <- na.omit(
  odds_with_results[, .(matchId, Home, Away, Tie, bts_NO, bts_YES, ou_over,
                        ou_under, Overstatus, bookmaker, ha_1, ha_2)]
)

# Task 2 table: odds plus the match outcome label; incomplete rows are removed.
dt_merged2 <- na.omit(
  odds_with_results[, .(matchId, Home, Away, Tie, bts_NO, bts_YES, ou_over,
                        ou_under, Outcomes, bookmaker, ha_1, ha_2)]
)

Task 1

  • PCA Analysis
# Keep the first 250 rows for plotting, with the nine odds columns first,
# followed by the class label and the bookmaker.
dt_merged <- dt_merged[
  seq_len(250),
  .(Home, Away, Tie, bts_NO, bts_YES, ou_over, ou_under, ha_1, ha_2,
    Overstatus, bookmaker)
]

# PCA on the nine odds columns (columns 1-9), centred and scaled to unit variance.
dt_merged.pca <- prcomp(dt_merged[, 1:9], scale. = TRUE, center = TRUE)
summary(dt_merged.pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.9544 1.3813 1.1062 0.9924 0.9866 0.21786 0.15656
## Proportion of Variance 0.4244 0.2120 0.1360 0.1094 0.1082 0.00527 0.00272
## Cumulative Proportion  0.4244 0.6364 0.7723 0.8818 0.9899 0.99522 0.99795
##                            PC8     PC9
## Standard deviation     0.10562 0.08553
## Proportion of Variance 0.00124 0.00081
## Cumulative Proportion  0.99919 1.00000

PC1 explains about 42% of the total variance; together with PC2 it explains about 64%. These two components can therefore provide insight into the data.

# Biplot of the first two principal components; observations are grouped by
# Over/Under status and an ellipse is drawn per group.
ggbiplot(
  dt_merged.pca,
  choices = 1:2,
  scale = 0,
  obs.scale = 1,
  var.scale = 1,
  groups = dt_merged$Overstatus,
  ellipse = TRUE
)

The Away, ha_2, ou_under and Tie features are located close to each other, which indicates that these variables are correlated. The Home, ha_1 and ou_over variables are also located close together, so they are correlated with each other as well. These features could be combined for feature reduction.

The Over and Under results overlap heavily; they are generally plotted in the same regions.

  • MDS plotting for euclidean distance
# Classical MDS on Euclidean distances between the odds vectors.
# Only the nine numeric odds columns go into the distance: `dt_merged[, -10]`
# dropped Overstatus but still carried the character `bookmaker` column.
odds_cols <- c("Home", "Away", "Tie", "bts_NO", "bts_YES",
               "ou_over", "ou_under", "ha_1", "ha_2")
d_mcd <- dist(as.matrix(dt_merged[, odds_cols, with = FALSE]),
              method = "euclidean")
MDS <- cmdscale(d_mcd)

# Colour each point by its own Over/Under label. A bare c("red", "green") is
# recycled row by row and has no relation to the class.
status_colours <- c(Over = "red", Under = "green")
plot(MDS[, 1], MDS[, 2], pch = 21, bg = status_colours[dt_merged$Overstatus])
legend("topright", legend = names(status_colours), pt.bg = status_colours,
       pch = 21)
  • MDS plotting for manhattan distance
# Classical MDS on Manhattan distances between the odds vectors.
# Only the nine numeric odds columns go into the distance: `dt_merged[, -10]`
# dropped Overstatus but still carried the character `bookmaker` column.
odds_cols <- c("Home", "Away", "Tie", "bts_NO", "bts_YES",
               "ou_over", "ou_under", "ha_1", "ha_2")
d_mcd <- dist(as.matrix(dt_merged[, odds_cols, with = FALSE]),
              method = "manhattan")
MDS <- cmdscale(d_mcd)

# Colour each point by its own Over/Under label. A bare c("red", "green") is
# recycled row by row and has no relation to the class.
status_colours <- c(Over = "red", Under = "green")
plot(MDS[, 1], MDS[, 2], pch = 21, bg = status_colours[dt_merged$Overstatus])
legend("topright", legend = names(status_colours), pt.bg = status_colours,
       pch = 21)

The points in the Manhattan-distance plot show a smaller variance than those in the Euclidean-distance plot. The MDS plots show that the Over and Under results largely overlap, which is consistent with the PCA results.

Task 2

# Keep the first 250 rows, with the nine odds columns first, followed by the
# match outcome label and the bookmaker.
dt_merged2 <- dt_merged2[
  seq_len(250),
  .(Home, Away, Tie, bts_NO, bts_YES, ou_over, ou_under, ha_1, ha_2,
    Outcomes, bookmaker)
]

# PCA on the nine odds columns (columns 1-9), centred and scaled to unit variance;
# printing the fit shows the standard deviations and the rotation (loadings).
dt_merged2.pca <- prcomp(dt_merged2[, 1:9], scale. = TRUE, center = TRUE)
print(dt_merged2.pca)
## Standard deviations (1, .., p=9):
## [1] 1.95435060 1.38128540 1.10623624 0.99243457 0.98662598 0.21786416
## [7] 0.15656391 0.10562251 0.08553013
## 
## Rotation (n x k) = (9 x 9):
##                   PC1          PC2        PC3         PC4         PC5
## Home      0.298677020 -0.549597956  0.2501227 -0.04213578 -0.02086839
## Away     -0.491049218  0.022279585  0.2178073 -0.09586584 -0.04514108
## Tie      -0.392787000 -0.192246939  0.4794345 -0.19899760 -0.08289685
## bts_NO    0.300073950  0.414635221  0.4655681 -0.20154250  0.01536690
## bts_YES  -0.331035896 -0.410437232 -0.4130381  0.16102173  0.04649162
## ou_over   0.004984756 -0.069422513 -0.2524744 -0.80110990  0.53683934
## ou_under -0.055482200  0.009252048  0.2603544  0.47815914  0.83571302
## ha_1      0.275068811 -0.561315686  0.2858488 -0.05597631 -0.02880088
## ha_2     -0.486812040  0.011195932  0.2440285 -0.09664530 -0.02854943
##                  PC6         PC7         PC8          PC9
## Home      0.03964179 -0.01513666 -0.21726939  0.703530583
## Away      0.12136258 -0.28501821  0.69690309  0.343610420
## Tie      -0.17229441  0.68731715 -0.06180683 -0.162849119
## bts_NO    0.68732606  0.06606402 -0.05024250 -0.033440085
## bts_YES   0.69124996  0.18438629 -0.07172082 -0.079359162
## ou_over  -0.03406611 -0.01438613  0.00676774  0.005055848
## ou_under -0.03713158  0.02018891  0.01453152 -0.001338908
## ha_1      0.03120514 -0.32673762  0.26390753 -0.588397025
## ha_2      0.01605154 -0.54808381 -0.62106978 -0.082475657
# Per-component standard deviation, proportion of variance and cumulative
# proportion for the Task 2 PCA fit.
summary(dt_merged2.pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.9544 1.3813 1.1062 0.9924 0.9866 0.21786 0.15656
## Proportion of Variance 0.4244 0.2120 0.1360 0.1094 0.1082 0.00527 0.00272
## Cumulative Proportion  0.4244 0.6364 0.7723 0.8818 0.9899 0.99522 0.99795
##                            PC8     PC9
## Standard deviation     0.10562 0.08553
## Proportion of Variance 0.00124 0.00081
## Cumulative Proportion  0.99919 1.00000

PC1 explains about 42% of the total variance; together with PC2 it explains about 64%.

# Biplot of the first two principal components; observations are grouped by
# match outcome and an ellipse is drawn per group.
ggbiplot(
  dt_merged2.pca,
  choices = 1:2,
  scale = 0,
  obs.scale = 1,
  var.scale = 1,
  groups = dt_merged2$Outcomes,
  ellipse = TRUE
)

The match outcomes overlap with each other; they are located in similar regions of the plot.

Task 3

  • Read Image and Plot
library(jpeg)

# Read the JPEG into a numeric array.
image <- readJPEG("~/Documents/Dersler/Fall-2018/IE-582/image.jpg")

# Open an empty 512 x 512 canvas and draw the image so that it fills it.
plot(x = c(0, 512), y = c(0, 512), type = "n", xlab = "", ylab = "")
rasterImage(image, xleft = 0, ybottom = 0, xright = 512, ytop = 512)

The data are stored as numeric values, and the array dimensions are 512 × 512 × 3.

  • Plot according to color channel
# Split the image array into its three colour channels.
r <- image[, , 1]
g <- image[, , 2]
b <- image[, , 3]

# Draw the three channels side by side and restore the previous layout afterwards.
# image() maps matrix rows to the x axis and columns to the y axis with column 1
# at the bottom, so the matrix is transposed and only its second index is
# reversed. Reversing the first index as well mirrors the picture left-right.
old_par <- par(mfrow = c(1, 3))
for (channel in list(r, g, b)) {
  image(t(channel)[, nrow(channel):1])
}
par(old_par)

  • Creating Noisy Image
# Add independent uniform noise in [0, 0.1] to every pixel of every channel.
# One vectorised draw replaces one runif() call per array element.
image <- image + array(
  runif(length(image), min = 0, max = 0.1),
  dim = dim(image)
)

# rasterImage() needs intensities in [0, 1]: saturate at 1. Mapping values
# above 1 to 0.99 would make them darker than legitimate values in (0.99, 1].
image[image > 1] <- 1

# Draw the noisy image on an empty 512 x 512 canvas.
plot(x = c(0, 512), y = c(0, 512), type = "n", xlab = "", ylab = "")
rasterImage(image, xleft = 0, ybottom = 0, xright = 512, ytop = 512)

  • Noisy Image Channel Plotting
# Split the (noisy) image array into its three colour channels.
r <- image[, , 1]
g <- image[, , 2]
b <- image[, , 3]

# Draw the three channels side by side and restore the previous layout afterwards.
# image() maps matrix rows to the x axis and columns to the y axis with column 1
# at the bottom, so the matrix is transposed and only its second index is
# reversed. Reversing the first index as well mirrors the picture left-right.
old_par <- par(mfrow = c(1, 3))
for (channel in list(r, g, b)) {
  image(t(channel)[, nrow(channel):1])
}
par(old_par)

  • PCA analysis for grayscale image
library(jpeg)

# Read the grayscale JPEG that is used for the patch analysis.
imagea <- readJPEG("~/Documents/Dersler/Fall-2018/IE-582/imagegg.jpg")

# Draw it on an empty 512 x 512 canvas.
plot(x = c(0, 512), y = c(0, 512), type = "n", xlab = "", ylab = "")
rasterImage(imagea, xleft = 0, ybottom = 0, xright = 512, ytop = 512)

  • Patch Extractions
# Extract every 3 x 3 patch whose centre (i, j) lies in rows/columns 2..510.
# Each patch becomes one row of `mt`, flattened row by row (top-left pixel in
# column 1, bottom-right pixel in column 9); patches are stored in the order
# they are visited, with j varying fastest.
# NOTE(review): 260100 (= 510^2) rows are allocated but the loops visit only
# 509 * 509 = 259081 centres, so the last 1019 rows stay zero. Centres 2..511
# would fill the matrix, but the reconstruction loops further down index the
# rows by position and would have to change in step.
mt <- matrix(0, nrow = 260100, ncol = 9)
m <- 0
for (i in 2:510) {
  for (j in 2:510) {
    m <- m + 1
    patch <- imagea[(i - 1):(i + 1), (j - 1):(j + 1)]
    # t() makes as.vector() read the patch row by row.
    mt[m, ] <- as.vector(t(patch))
  }
}
 
# PCA on the nine patch columns, without centring or scaling the data.
mt.pca <- prcomp(
  x = mt[, 1:9],
  center = FALSE,
  scale. = FALSE
)
summary(mt.pca)
## Importance of components:
##                           PC1     PC2    PC3     PC4     PC5     PC6
## Standard deviation     2.5876 0.04743 0.0449 0.01681 0.01347 0.01098
## Proportion of Variance 0.9993 0.00034 0.0003 0.00004 0.00003 0.00002
## Cumulative Proportion  0.9993 0.99960 0.9999 0.99995 0.99997 0.99999
##                             PC7      PC8      PC9
## Standard deviation     0.004988 0.004759 0.003043
## Proportion of Variance 0.000000 0.000000 0.000000
## Cumulative Proportion  1.000000 1.000000 1.000000

PC1 accounts for about 99.9% of the total variance, which means that it alone captures almost all of the information in the data.

  • Reconstructing for first component
# Rank-1 reconstruction of every patch from the first principal component:
# scores (one per patch) times the transposed loading vector.
deneme <- mt.pca$x[, 1] %*% t(mt.pca$rotation[, 1])
recall <- matrix(0, nrow = 512, ncol = 512)

# The extraction loop stored the patch centred at (i, j), i, j in 2..510, in row
# (i - 2) * 509 + (j - 1) of `mt` (j varies fastest). Rows are addressed through
# that layout directly; counting down from row 260100 shifts every patch by
# 1019 rows and pulls in the unused all-zero rows at the end of the matrix.
centres <- 2:510
n_side <- length(centres)
patch_rows <- seq_len(n_side^2)

# Patch column n holds the pixel at offset (k, l) from the centre, row by row.
# Offsets are written in increasing order, so where patches overlap the value
# from the largest offset wins, the same precedence as the descending scalar loops.
for (k in -1:1) {
  for (l in -1:1) {
    n <- (k + 1) * 3 + (l + 2)
    block <- matrix(deneme[patch_rows, n], nrow = n_side, byrow = TRUE)
    # rasterImage() needs intensities in [0, 1]; clamp on both sides.
    recall[centres + k, centres + l] <- pmin(pmax(block, 0), 1)
  }
}

# Draw the reconstructed image on an empty 512 x 512 canvas.
plot(c(0, 512), c(0, 512), type = "n", xlab = "", ylab = "")
rasterImage(recall, 0, 0, 512, 512)

  • For second component
# Rank-1 reconstruction of every patch from the second principal component:
# scores (one per patch) times the transposed loading vector.
deneme2 <- mt.pca$x[, 2] %*% t(mt.pca$rotation[, 2])
recall2 <- matrix(0, nrow = 512, ncol = 512)

# The extraction loop stored the patch centred at (i, j), i, j in 2..510, in row
# (i - 2) * 509 + (j - 1) of `mt` (j varies fastest). Rows are addressed through
# that layout directly; counting down from row 260100 shifts every patch by
# 1019 rows and pulls in the unused all-zero rows at the end of the matrix.
centres <- 2:510
n_side <- length(centres)
patch_rows <- seq_len(n_side^2)

# Patch column n holds the pixel at offset (k, l) from the centre, row by row.
# Offsets are written in increasing order, so where patches overlap the value
# from the largest offset wins, the same precedence as the descending scalar loops.
for (k in -1:1) {
  for (l in -1:1) {
    n <- (k + 1) * 3 + (l + 2)
    block <- matrix(deneme2[patch_rows, n], nrow = n_side, byrow = TRUE)
    # rasterImage() needs intensities in [0, 1]; clamp on both sides
    # (negative contributions of this component are shown as black).
    recall2[centres + k, centres + l] <- pmin(pmax(block, 0), 1)
  }
}

# Draw the reconstructed image on an empty 512 x 512 canvas.
plot(c(0, 512), c(0, 512), type = "n", xlab = "", ylab = "")
rasterImage(recall2, 0, 0, 512, 512)

  • For third component
# Rank-1 reconstruction of every patch from the third principal component:
# scores (one per patch) times the transposed loading vector.
deneme3 <- mt.pca$x[, 3] %*% t(mt.pca$rotation[, 3])
recall3 <- matrix(0, nrow = 512, ncol = 512)

# The extraction loop stored the patch centred at (i, j), i, j in 2..510, in row
# (i - 2) * 509 + (j - 1) of `mt` (j varies fastest). Rows are addressed through
# that layout directly; counting down from row 260100 shifts every patch by
# 1019 rows and pulls in the unused all-zero rows at the end of the matrix.
centres <- 2:510
n_side <- length(centres)
patch_rows <- seq_len(n_side^2)

# Patch column n holds the pixel at offset (k, l) from the centre, row by row.
# Offsets are written in increasing order, so where patches overlap the value
# from the largest offset wins, the same precedence as the descending scalar loops.
for (k in -1:1) {
  for (l in -1:1) {
    n <- (k + 1) * 3 + (l + 2)
    block <- matrix(deneme3[patch_rows, n], nrow = n_side, byrow = TRUE)
    # rasterImage() needs intensities in [0, 1]; clamp on both sides
    # (negative contributions of this component are shown as black).
    recall3[centres + k, centres + l] <- pmin(pmax(block, 0), 1)
  }
}

# Draw the reconstructed image on an empty 512 x 512 canvas.
plot(c(0, 512), c(0, 512), type = "n", xlab = "", ylab = "")
rasterImage(recall3, 0, 0, 512, 512)

  • Task 4c for first component
# Show the first eigenvector (first column of the rotation matrix) as a 3 x 3 image.
tsk4 <- mt.pca$rotation[, 1]

# Patch columns were filled row by row during extraction, so the nine loadings
# are laid out with byrow = TRUE. Looping over `for (i in 1)` / `for (j in 1)`
# only ever wrote cell [1, 1]. The sign flip is kept from the original; the
# sign of a principal component is arbitrary and only inverts the contrast.
imagev <- matrix(tsk4 * -1, nrow = 3, ncol = 3, byrow = TRUE)

# rasterImage() needs values in [0, 1]: rescale the loadings to that range
# (a constant filter is drawn as mid-grey).
value_range <- range(imagev)
if (diff(value_range) > 0) {
  imagev <- (imagev - value_range[1]) / diff(value_range)
} else {
  imagev[] <- 0.5
}

# interpolate = FALSE keeps the nine cells as sharp squares.
plot(c(0, 3), c(0, 3), type = "n", xlab = "", ylab = "")
rasterImage(imagev, 0, 0, 3, 3, interpolate = FALSE)

  • Task 4c for second component
# Show the second eigenvector as a 3 x 3 image. The eigenvector is the second
# COLUMN of the rotation matrix; `rotation[2, ]` is the loading of the second
# patch pixel on all nine components.
tsk42 <- mt.pca$rotation[, 2]

# Patch columns were filled row by row during extraction, so the nine loadings
# are laid out with byrow = TRUE. Looping over `for (i in 1)` / `for (j in 1)`
# only ever wrote cell [1, 1].
imagev2 <- matrix(tsk42, nrow = 3, ncol = 3, byrow = TRUE)

# rasterImage() needs values in [0, 1]: rescale the loadings to that range
# (a constant filter is drawn as mid-grey).
value_range <- range(imagev2)
if (diff(value_range) > 0) {
  imagev2 <- (imagev2 - value_range[1]) / diff(value_range)
} else {
  imagev2[] <- 0.5
}

# interpolate = FALSE keeps the nine cells as sharp squares.
plot(c(0, 3), c(0, 3), type = "n", xlab = "", ylab = "")
rasterImage(imagev2, 0, 0, 3, 3, interpolate = FALSE)

# Show the third eigenvector as a 3 x 3 image. The eigenvector is the third
# COLUMN of the rotation matrix; `rotation[3, ]` is the loading of the third
# patch pixel on all nine components.
tsk43 <- mt.pca$rotation[, 3]

# Patch columns were filled row by row during extraction, so the nine loadings
# are laid out with byrow = TRUE. Looping over `for (i in 1)` / `for (j in 1)`
# only ever wrote cell [1, 1]. The sign flip is kept from the original; the
# sign of a principal component is arbitrary and only inverts the contrast.
imagev3 <- matrix(tsk43 * -1, nrow = 3, ncol = 3, byrow = TRUE)

# rasterImage() needs values in [0, 1]: rescale the loadings to that range
# (a constant filter is drawn as mid-grey).
value_range <- range(imagev3)
if (diff(value_range) > 0) {
  imagev3 <- (imagev3 - value_range[1]) / diff(value_range)
} else {
  imagev3[] <- 0.5
}

# interpolate = FALSE keeps the nine cells as sharp squares.
plot(c(0, 3), c(0, 3), type = "n", xlab = "", ylab = "")
rasterImage(imagev3, 0, 0, 3, 3, interpolate = FALSE)